We should first configure where Rstudio loads packages from, we just need to run the setup script. We should also run the dependencies script which contains paths of software or reference files we’re gonna use.
#Configure R studio packages location
library(ssh)
## Linking to libssh v0.8.6
run= F
stats3 = F
source('dependencies.R')
if(stats3==F){
#session <- ssh_connect("dgrigori@stats3.sgul.ac.uk",)
}
## NULL
sr = paste0("~/Mimir",unlist(strsplit(getwd(),":"))[2])
Load Parameters + packages
library(rlang, quietly = T)
#install.packages("vctrs")
library(vctrs, quietly = T)
library(pillar, quietly = T)
library(magrittr, quietly = T)
library(dplyr, quietly = T)
library(knitr, quietly = T)
#library(stringi)
library(tidyselect, quietly = T)
library(plotly, quietly = T)
library(rprojroot, quietly = T)
library(rmarkdown, quietly = T)
library(DT, quietly = T)
#library(scales)
library(ggplot2, quietly = T)
library(kableExtra, quietly = T)
library(data.table, quietly = T)
#library(dplyr)
library(qqman, quietly = T)
library(CMplot, quietly = T)
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17763)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252
## [2] LC_CTYPE=English_United Kingdom.1252
## [3] LC_MONETARY=English_United Kingdom.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] CMplot_3.6.2 qqman_0.1.4 tidyselect_1.1.0 vctrs_0.3.6
## [5] rlang_0.4.10 ssh_0.7.0 data.table_1.13.6 kableExtra_1.3.1
## [9] scales_1.1.1 DT_0.17 rmarkdown_2.6 rprojroot_2.0.2
## [13] plotly_4.9.3 ggplot2_3.3.3 stringi_1.5.3 knitr_1.31
## [17] dplyr_1.0.4 magrittr_2.0.1 pillar_1.4.7
##
## loaded via a namespace (and not attached):
## [1] compiler_4.0.3 sys_3.4 tools_4.0.3 digest_0.6.27
## [5] packrat_0.5.0 jsonlite_1.7.2 evaluate_0.14 lifecycle_0.2.0
## [9] tibble_3.0.6 gtable_0.3.0 viridisLite_0.3.0 pkgconfig_2.0.3
## [13] rstudioapi_0.13 yaml_2.2.1 xfun_0.20 stringr_1.4.0
## [17] xml2_1.3.2 withr_2.4.1 httr_1.4.2 askpass_1.1
## [21] generics_0.1.0 htmlwidgets_1.5.3 webshot_0.5.2 grid_4.0.3
## [25] calibrate_1.7.7 glue_1.4.2 R6_2.5.0 purrr_0.3.4
## [29] tidyr_1.1.2 MASS_7.3-53 credentials_1.3.0 ellipsis_0.3.1
## [33] htmltools_0.5.1.1 rvest_0.3.6 colorspace_2.0-0 openssl_1.4.3
## [37] lazyeval_0.2.2 munsell_0.5.0 crayon_1.4.0
Peripheral blood from 148 lipoedema patients of white British ethnicity were collected, DNA extracted and genotyped in two batches using Illumina Infinium_CoreExome-24_v1-2 (Batch1) and Infinium_Core-24_v1-2-a1 (Batch2) single nucleotide polymorphism (SNP) chips, respectively.
sample_basename = "Input/Batch1_final/batch1_updated_all_prefixed"
dir = dirname(sample_basename)
qcdir = file.path(dir, "Qc/")
finalname = "lipoedema1_clean"
dir.create(dir)
dir.create(qcdir)
Run PLINK Linux commands to:
1)Generate the missing rate/calling rate, heterozygocity and sex checking files
./resources/software/plink1.9/plink –bfile Input/Batch1_final/batch1_updated_all_prefixed –missing –het –check-sex –out Input/Batch1_final/Qc/batch1_updated_all_prefixed
2)Prune SNPs in relative LD so we can correctly calculate IBD
./resources/software/plink1.9/plink –bfile Input/Batch1_final/batch1_updated_all_prefixed –indep-pairwise 50 5 0.5 –out Input/Batch1_final/Qc/batch1_updated_all_prefixed
3)IBD on pruned variants
./resources/software/plink1.9/plink –bfile Input/Batch1_final/batch1_updated_all_prefixed –maf 0.05 –genome –out Input/Batch1_final/Qc/batch1_updated_all_prefixed –extract Input/Batch1_final/Qc/batch1_updated_all_prefixed.prune.in
Import all the files that plink generated into R. Then we can plot and do some statistical analyses on them.
#Define the file names
imiss = file.path(qcdir,paste0(basename(sample_basename),'.imiss'))
het = file.path(qcdir,paste0(basename(sample_basename),'.het'))
sexcheck = file.path(qcdir,paste0(basename(sample_basename),'.sexcheck'))
ibd = file.path(qcdir,paste0(basename(sample_basename),'.genome'))
# Stop the script if the data file cannot be found in the current working directory:
stopifnot(file.exists(imiss))
stopifnot(file.exists(het))
stopifnot(file.exists(sexcheck))
stopifnot(file.exists(ibd))
#Load the actual data into R
miss_tab <- read.table(file=imiss, header=T, stringsAsFactors = FALSE)
het_tab <- read.table(file=het, header=T, stringsAsFactors = FALSE)
sexchk_tab <- read.table(file=sexcheck, header=T, stringsAsFactors = FALSE)
ibd_tab <- read.table(file=ibd, header=T, stringsAsFactors = FALSE)
Autosomal Heterozygocity = (Number of autosomal genotypes - Observed number of homozygotes)/Number of autosomal genotypes
Calling Rate = 1-(Number of missing genotype calls/ Number of valid calls)
Outliers: Samples with Heterozygocity < mean-3SD heterozygocity calling rate
Samples with Heterozygocity > mean+3SD heterozygocity calling rate
Samples with Calling rate < 0.97
#Calculate the heterozygocity rate
het_rate = (het_tab$N.NM. - het_tab$O.HOM.)/het_tab$N.NM.
#Calculate the genotype calling rate
cal_rate = 1-miss_tab$F_MISS
names(het_rate) = het_tab$IID
names(cal_rate) = miss_tab$IID
#Name the outliers
outliers = unique(c(names(cal_rate[cal_rate < 0.97]), names(het_rate[het_rate > mean(het_rate)+(3*sd(het_rate)) | het_rate < mean(het_rate)-(3*sd(het_rate))])))
outliers_rates = data.frame(Name=outliers, Call=cal_rate[outliers], Het=het_rate[outliers])
bline_outliers = unique(c(names(cal_rate[cal_rate < 0.975]), names(het_rate[het_rate > mean(het_rate)+(2*sd(het_rate)) | het_rate < mean(het_rate)-(2*sd(het_rate))])))
bline_outliers_rates = data.frame(Name=bline_outliers, Call=cal_rate[bline_outliers], Het=het_rate[bline_outliers])
#Draw the heterozygocity to calling rate plot
hc_plot = ggplot(miss_tab, aes(x=het_rate, y=cal_rate)) +
geom_point(size=2, shape=21) +
labs(x="Autosomal Heterozygocity Rate", y = "Calling Rate") +
theme_classic() +
geom_vline(xintercept=mean(het_rate)+(3*sd(het_rate)), colour='green', linetype = "longdash", size=0.2) +
geom_vline(xintercept=mean(het_rate)-(3*sd(het_rate)), colour='green', linetype = "longdash", size=0.2) +
geom_hline(yintercept=0.97, colour='gray', linetype = "longdash", size=0.2) #+
#geom_text(data=bline_outliers_rates, aes(x=Het, y=Call, label=Name), hjust=+1.2, vjust=0)
outliers = outliers
hc_plot
Grey horizontal line: Calling Rate=0.97 threshold
Green vertical line: ±3 SD deviation from the heterozygosity rate mean of the samples threshold
3 Outliers (will be excluded)
ibd_tab = ibd_tab[!(ibd_tab$IID1 %in% outliers | ibd_tab$IID2 %in% outliers),]
sexchk_tab = sexchk_tab[!sexchk_tab$IID %in% outliers,]
Can our genotype calls correctly predict the sex of our samples?
This is a QC step involving the calculation of the F inbreeding coefficient of the X chromosome. Under sex-linkage, inbreeding only causes an increase in the frequency of homozygous recessives in females. There is no meaning to the inbreeding coefficient of a male since he is hemizygous (i.e. he has only one X chromosome) and is, therefore, not affected by inbreeding. Check here for more information. Therefore, according to plink we are expecting:
F < 0.2 for females
F > 0.6 for males
#Get the F coefficient
sex_f = sexchk_tab$F
rownames(sexchk_tab)=paste0(sexchk_tab$FID,"-",sexchk_tab$IID)
#Create the plot
sex_f_color = ifelse(sexchk_tab$PEDSEX==sexchk_tab$SNPSEX, 'pink', 'grey')
barplot(sexchk_tab$F, col = sex_f_color, main='',
xlab='F inbreeding coefficient', ylab='No. of samples')
legend('bottomright', legend=c('True Females', 'Females predicted as Males'),
col=c('pink','gray'), lwd=10)
According to this histogram, all of our samples had an F coefficient less than 0.2 showing that our samples are reliably female samples.
Create a file of these IIDs and their FIDs which will be excluded. This file will be used with the –remove plink command.
plink_exclude = miss_tab[miss_tab$IID %in% outliers,c('FID', 'IID')]
if(length(outliers)!=nrow(plink_exclude)){stop()}
#write.table(plink_exclude, 'tempexcIDS.txt', sep='\t', row.names = FALSE, quote=FALSE)
We need to remove all four samples we decided to exclude from the analysis:
./resources/software/plink1.9/plink –bfile Input/Batch1_final/batch1_updated_all_prefixed –geno 0.05 –maf 0.01 –hwe 1e-6 midp –remove tempexcIDS.txt –make-bed –out Input/Batch1_final/lipoedema1_clean; rm temp*
sample_basename = "Input/Batch2_final/batch2_updated_all_prefixed"
dir = dirname(sample_basename)
qcdir = file.path(dir, "Qc/")
finalname = "lipoedema2_clean"
dir.create(dir)
## Warning in dir.create(dir): 'Input\Batch2_final' already exists
dir.create(qcdir)
## Warning in dir.create(qcdir): 'Input\Batch2_final\Qc' already exists
Run PLINK Linux commands to:
1)Generate the missing rate/calling rate, heterozygocity and sex checking files
./resources/software/plink1.9/plink –bfile Input/Batch2_final/batch2_updated_all_prefixed –missing –het –check-sex –out Input/Batch2_final/Qc/batch2_updated_all_prefixed
2)Prune SNPs in relative LD so we can correctly calculate IBD
./resources/software/plink1.9/plink –bfile Input/Batch2_final/batch2_updated_all_prefixed –indep-pairwise 50 5 0.5 –out Input/Batch2_final/Qc/batch2_updated_all_prefixed
3)IBD on pruned variants
./resources/software/plink1.9/plink –bfile Input/Batch2_final/batch2_updated_all_prefixed –maf 0.05 –genome –out Input/Batch2_final/Qc/batch2_updated_all_prefixed –extract Input/Batch2_final/Qc/batch2_updated_all_prefixed.prune.in
Import all the files that plink generated into R. Then we can plot and do some statistical analyses on them.
#Define the file names
imiss = file.path(qcdir,paste0(basename(sample_basename),'.imiss'))
het = file.path(qcdir,paste0(basename(sample_basename),'.het'))
sexcheck = file.path(qcdir,paste0(basename(sample_basename),'.sexcheck'))
ibd = file.path(qcdir,paste0(basename(sample_basename),'.genome'))
# Stop the script if the data file cannot be found in the current working directory:
stopifnot(file.exists(imiss))
stopifnot(file.exists(het))
stopifnot(file.exists(sexcheck))
stopifnot(file.exists(ibd))
#Load the actual data into R
miss_tab <- read.table(file=imiss, header=T, stringsAsFactors = FALSE)
het_tab <- read.table(file=het, header=T, stringsAsFactors = FALSE)
sexchk_tab <- read.table(file=sexcheck, header=T, stringsAsFactors = FALSE)
ibd_tab <- read.table(file=ibd, header=T, stringsAsFactors = FALSE)
Autosomal Heterozygocity = (Number of autosomal genotypes - Observed number of homozygotes)/Number of autosomal genotypes
Calling Rate = 1-(Number of missing genotype calls/ Number of valid calls)
Outliers: Samples with Heterozygocity < mean-3SD heterozygocity calling rate
Samples with Heterozygocity > mean+3SD heterozygocity calling rate
Samples with Calling rate < 0.97
#Calculate the heterozygocity rate
het_rate = (het_tab$N.NM. - het_tab$O.HOM.)/het_tab$N.NM.
#Calculate the genotype calling rate
cal_rate = 1-miss_tab$F_MISS
names(het_rate) = het_tab$IID
names(cal_rate) = miss_tab$IID
#Name the outliers
outliers = unique(c(names(cal_rate[cal_rate < 0.97]), names(het_rate[het_rate > mean(het_rate)+(3*sd(het_rate)) | het_rate < mean(het_rate)-(3*sd(het_rate))])))
outliers_rates = data.frame(Name=outliers, Call=cal_rate[outliers], Het=het_rate[outliers])
bline_outliers = unique(c(names(cal_rate[cal_rate < 0.975]), names(het_rate[het_rate > mean(het_rate)+(2*sd(het_rate)) | het_rate < mean(het_rate)-(2*sd(het_rate))])))
bline_outliers_rates = data.frame(Name=bline_outliers, Call=cal_rate[bline_outliers], Het=het_rate[bline_outliers])
#Draw the heterozygocity to calling rate plot
hc_plot = ggplot(miss_tab, aes(x=het_rate, y=cal_rate)) +
geom_point(size=2, shape=21) +
labs(x="Autosomal Heterozygocity Rate", y = "Calling Rate") +
theme_classic() +
geom_vline(xintercept=mean(het_rate)+(3*sd(het_rate)), colour='green', linetype = "longdash", size=0.2) +
geom_vline(xintercept=mean(het_rate)-(3*sd(het_rate)), colour='green', linetype = "longdash", size=0.2) +
geom_hline(yintercept=0.97, colour='gray', linetype = "longdash", size=0.2) #+
#geom_text(data=bline_outliers_rates, aes(x=Het, y=Call, label=Name), hjust=+1.2, vjust=0)
hc_plot
outliers = outliers
Grey horizontal line: Calling Rate=0.97 threshold Green vertical line: ±3 SD deviation from the heterozygosity rate mean of the samples threshold 2 Outliers (will be excluded)
ibd_tab = ibd_tab[!(ibd_tab$IID1 %in% outliers | ibd_tab$IID2 %in% outliers),]
sexchk_tab = sexchk_tab[!sexchk_tab$IID %in% outliers,]
Can our genotype calls correctly predict the sex of our samples?
This is a QC step involving the calculation of the F inbreeding coefficient of the X chromosome. Under sex-linkage, inbreeding only causes an increase in the frequency of homozygous recessives in females. There is no meaning to the inbreeding coefficient of a male since he is hemizygous (i.e. he has only one X chromosome) and is, therefore, not affected by inbreeding. Check here for more information. Therefore, according to plink we are expecting:
F < 0.2 for females
F > 0.6 for males
#Get the F coefficient
sex_f = sexchk_tab$F
rownames(sexchk_tab)=paste0(sexchk_tab$FID,"-",sexchk_tab$IID)
#Create the plot
sex_f_color = ifelse(sexchk_tab$PEDSEX==sexchk_tab$SNPSEX, 'pink', 'grey')
barplot(sexchk_tab$F, col = sex_f_color, main='',
xlab='F inbreeding coefficient', ylab='No. of samples')
legend('bottomright', legend=c('True Females', 'Females predicted as Males'),
col=c('pink','gray'), lwd=10)
According to this histogram, all of our samples had an F coefficient less than 0.2 showing that our samples are reliably female samples.
Create a file of these IIDs and their FIDs which will be excluded. This file will be used with the –remove plink command.
plink_exclude = miss_tab[miss_tab$IID %in% outliers,c('FID', 'IID')]
if(length(outliers)!=nrow(plink_exclude)){stop()}
#write.table(plink_exclude, 'tempexcIDS.txt', sep='\t', row.names = FALSE, quote=FALSE)
We need to remove all samples we decided to exclude from the analysis:
./resources/software/plink1.9/plink –bfile Input/Batch2_final/batch2_updated_all_prefixed –geno 0.05 –maf 0.01 –hwe 1e-6 midp –remove tempexcIDS.txt –make-bed –out Input/Batch2_final/lipoedema2_clean; rm temp*
##Where are we going to run our analysis?
b1 = "Input/Batch1_final/lipoedema1_clean"
b2 = "Input/Batch2_final/lipoedema2_clean"
sample_basename = "Merged/Lipoedema_Batches_final/b1b2merged.no_at-cg.no_inc"
dir = dirname(sample_basename)
qcdir = file.path(dir, "Qc/")
qc2dir = file.path(qcdir, "Qc_arrays_inconsistency/")
finalname = "lipoedema_clean"
masterfile = "Input/Sample_info/kris_dx"
#lmaster = read.table(masterfile, sep="\t",stringsAsFactors = F)
#diagnoses <- lmaster$DIAGNOSIS.CONF.
#names(diagnoses) <- lmaster$D.Number
dir.create(dir, showWarnings = FALSE)
dir.create(qcdir, showWarnings = FALSE)
dir.create(qc2dir, showWarnings = FALSE)
Load the bim and fam files of the plink filesets we want to merge
b1_bim = data.frame(fread(paste0(b1,".bim")), stringsAsFactors = FALSE)
b2_bim = data.frame(fread(paste0(b2,".bim")), stringsAsFactors = FALSE)
b1_fam = data.frame(fread(paste0(b1,".fam")), stringsAsFactors = FALSE)
b2_fam = data.frame(fread(paste0(b2,".fam")), stringsAsFactors = FALSE)
rownames(b1_bim)=b1_bim$V2
rownames(b2_bim)=b2_bim$V2
#Add a unique genomic position identifier (pid) with format [no of chromosome]chr[genomic coordinate]
b1_bim$pid = paste0(b1_bim$V1,'chr',b1_bim$V4)
b2_bim$pid = paste0(b2_bim$V1,'chr',b2_bim$V4)
We need to only keep the common variants between the two batches. Variants not covered in both filesets will generate unwanted biases and batch effects.
Keep in Batch1 only the variants which are shared between Batch2 and Batch1
less Input/Batch2_final/lipoedema2_clean.bim | awk ‘{print $2}’ > tempIDS.txt; ./resources/software/plink1.9/plink –bfile Input/Batch1_final/lipoedema1_clean –extract tempIDS.txt –make-bed –out Merged/Lipoedema_Batches_final/b1.shared
Keep in Batch2 only the variants which are shared between Batch2 and Batch1
less Input/Batch1_final/lipoedema1_clean.bim | awk ‘{print $2}’ > tempIDS.txt; ./resources/software/plink1.9/plink –bfile Input/Batch2_final/lipoedema2_clean –extract tempIDS.txt –make-bed –out Merged/Lipoedema_Batches_final/b2.shared
1)Merge Batch2 and Filtered Batch1 (This step will fail due to strand flipping issues but it will create the usefull missnp file which will be used for strand flipping.)
./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/b2.shared –bmerge Merged/Lipoedema_Batches_final/b1.shared
2)Use the missnp file to filter flip the problematic snps in batch1 and then redo the merging.
./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/b1.shared –flip plink.missnp –make-bed –out Merged/Lipoedema_Batches_final/b1.shared.flipped
Now we can actually merge batch2 with the flipped version of batch1 3)Merge the flipped batch1 with the batch2 data-set
In this step WE ALSO REMOVE NON ACGT SNPS from the merged dataset
./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/b2.shared –bmerge Merged/Lipoedema_Batches_final/b1.shared.flipped –merge-mode 1 –snps-only just-acgt –make-bed –out Merged/Lipoedema_Batches_final/b1b2merged
Merge is now completed.
T -> A, A -> T, G -> C, C -> G SNPs are particularly hard to resolve when merging datasets. In this case we are going to remove them from the merged dataset.
## 652 T -> A, A -> T, G -> C, C -> G SNPs are being removed:
## kgp776941, rs4646515, rs307354, rs2843130, rs11121382, rs848578, kgp468659, GA000342, rs665691, rs11247703, rs501832, rs837398, rs2297812, rs10788882, rs671108, rs7523104, kgp12171305, rs6588109, rs549148, rs10489626, rs17131479, kgp7261306, rs12730292, rs3738573, rs12144715, rs2503243, rs12749687, rs3843306, rs12030843, kgp7546508, rs1520668, rs2786529, rs309087, rs1124427, rs521734, kgp3155172, kgp8762234, kgp6470243, rs2488429, kgp1092768, kgp10847070, rs3136701, kgp1727928, kgp2698943, rs1212352, rs6666258, rs11537583, rs4147595, rs2236866, rs4987389, kgp6541618, kgp1999495, kgp7005320, rs17649050, rs859715, rs609521, rs12568050, rs832174, rs6693954, rs2243158, rs12130212, rs7524203, rs4149230, rs10916025, rs369909, rs514230, rs10926556, rs3102947, rs478222, rs4666002, kgp522146, kgp1097343, rs1031261, kgp3277363, rs891706, rs3816182, rs17035884, rs12620679, rs4233949, rs12713268, rs10184151, rs2234500, rs2916519, rs867529, rs2314398, rs3748930, rs13151, rs1402467, rs2276561, rs11123170, rs4513299, rs10207628, rs13025959, rs1050900, kgp5104289, rs6735208, rs6736104, rs6741949, rs1521527, rs1993308, kgp11638042, rs11757, rs2037799, rs12476147, rs10184895, rs7582694, rs2176528, rs12693898, rs17448457, rs4675644, rs2229571, rs12987009, rs1983210, rs2047136, rs16824283, rs13395911, kgp7355471, rs1042640, kgp7359081, rs4305276, rs1003972, rs402675, rs2279975, rs271066, rs2290159, rs7640928, rs4858649, rs7630967, kgp6063557, rs7372943, rs2271087, rs1468542, rs1005678, rs10490778, rs7433808, rs13069000, rs13069049, rs11713321, rs12486865, rs1479371, rs2276872, rs9811423, kgp11508564, rs4677948, rs10512627, kgp12182453, rs8649, rs5013525, rs4894410, rs9832727, kgp2007953, rs4149496, rs10935807, rs6798100, rs7628548, rs11920090, rs7430671, rs10937329, rs17505102, rs6815464, rs11734132, rs3775948, rs11733056, rs1972463, rs13101978, rs10014198, rs279845, rs726967, rs4148271, kgp6788842, rs3775782, rs13133166, rs344141, rs931605, rs578017, rs10222732, rs10019009, rs2725220, kgp3414215, kgp2808767, rs17219837, rs4147541, 401070, kgp3087994, kgp11897209, rs2306986, kgp1506779, kgp1489278, rs2132845, rs10028213, rs1466662, rs17046216, rs7697970, rs4241816, rs1593, rs1062547, rs7447815, rs4147775, rs7702187, rs3822410, rs4701523, rs12523160, rs1063499, rs7709645, rs2305962, rs164572, rs17663555, rs961098, kgp3636818, kgp1037316, rs17513503, rs7705033, kgp775040, rs274547, rs2548966, rs744674, rs12653537, rs3792796, rs13358864, rs2278255, rs2070998, rs1993554, rs3798713, kgp641225, rs7747752, rs2010190, rs198843, rs9467750, kgp5879917, rs454182, rs7743296, rs209151, rs3094548, rs1010408, rs3094736, rs1611737, rs9261578, rs1264540, rs2534823, rs3132613, rs4713422, rs2233967, rs130078, rs34282959, rs9266399, rs7775117, rs2523453, rs2523685, rs3828890, rs3853601, rs9332739, rs6455, rs9267845, rs1041885, rs35265698, rs9271300, kgp26489400, kgp12191290, kgp1963903, kgp7884518, kgp2881793, rs4713605, rs6913896, rs734181, rs9394159, rs831510, kgp5152142, rs4714750, kgp597715, rs2206271, rs9474433, rs561930, rs9446917, rs13202860, rs11755527, rs211221, kgp9903653, rs7760535, rs619203, rs6569474, rs9478858, kgp10632871, exm592133, rs662138, rs3818678, kgp7801610, rs3127594, rs2504916, kgp648939, rs12214416, rs7772437, rs4252105, GA006079, rs10275044, rs10277115, rs10245377, rs852499, rs6967241, kgp3981052, kgp4690887, kgp4722850, rs324981, rs1065647, rs28637820, rs884103, rs10245531, rs4148830, kgp5355388, rs2888611, rs17064, kgp1123831, rs705382, rs10487131, kgp3815960, rs7801803, rs4727338, kgp12362128, rs4729656, rs41521, rs6958498, rs1404697, rs4148686, rs4609139, rs2598291, rs194150, rs2240395, rs7781853, rs6944935, rs10282008, rs10263087, rs3800855, rs2407314, rs930557, rs6984305, rs1484642, rs12458, rs2460338, rs437548, rs7833268, rs2614091, rs2439302, rs12677427, rs2555588, rs2380571, rs10100101, rs2167801, rs7007970, rs2875931, rs2001945, kgp1315012, rs9644251, rs6558295, rs6474795, rs3731211, rs4978053, rs7468614, rs3043, kgp3137687, rs914715, kgp10939539, rs10780660, rs10868406, rs10116326, rs10118939, rs10819937, rs9409038, rs3824479, rs1929842, kgp2407207, kgp27322098, rs3204145, rs1327796, rs3818764, rs7852872, rs7039505, rs7851693, rs2073873, rs11103473, rs12722605, rs2501677, rs4748140, rs4748309, rs12258967, rs4747442, rs1148186, rs4590817, kgp5504085, rs1871452, rs754466, kgp12472133, kgp10638206, rs7923151, rs3758383, kgp10852535, kgp7668749, rs1853205, rs28399513, kgp537287, rs17222723, kgp7613533, rs521674, rs2065779, rs12413624, rs1999628, rs7923707, kgp5910402, rs4752781, rs689, rs163182, rs1026231, rs10500633, rs2722769, rs11022131, rs11022257, rs11023332, rs1793003, rs391317, rs2074040, rs4148904, rs12285276, rs2070852, rs10501320, rs1945213, rs174479, rs2276299, kgp5311770, rs504915, rs10896027, kgp4375382, rs2276288, rs597480, rs7396702, rs10830962, rs10830989, rs11021436, rs633185, rs516091, rs4938016, rs17118846, rs12225230, rs4938642, rs11218350, rs11222084, rs4765905, rs4765913, rs11053646, rs4149192, rs4149197, rs10841684, kgp277759, rs5488, kgp6443294, kgp11801419, rs864360, rs10842262, rs12819160, rs7953528, rs3759302, rs11564148, rs736825, rs2638315, rs941207, rs1495377, rs11178997, rs10879392, rs2365919, rs7300275, rs12423247, rs4584635, rs4766578, rs10849893, rs2388082, rs12323063, rs3812883, rs3736830, rs1063181, rs287398, rs9543067, rs1751051, rs279942, kgp8633405, rs12873360, rs4773279, kgp9114043, rs2273394, rs12879346, rs9323124, rs12436916, rs1061108, rs1997896, rs2025009, rs2239557, rs2232700, rs61280460, rs2069590, rs11628318, rs3178152, rs2273703, rs4906695, rs2178004, rs11635685, rs7166580, rs9325, kgp9514200, rs3204689, rs4775041, rs8034331, rs3743171, rs4776970, rs17279860, rs1874953, rs17875563, rs2521501, rs7165226, GA030783, kgp3881218, rs7168353, rs970843, kgp11779065, rs1005190, rs323043, rs11076863, rs11643447, rs4148344, kgp799675, rs212090, kgp16452123, kgp5026912, rs7190447, rs9924528, rs11649653, kgp4393797, rs12447652, rs2917666, rs4788815, rs16973585, rs7186908, kgp9244206, kgp961638, kgp6232103, kgp6429313, rs11149631, rs1060253, rs1056707, rs12051548, rs12449580, rs4151120, rs4462665, rs4985726, kgp7846805, rs2228100, rs9908604, rs8073060, rs11653310, rs16971217, rs2293152, rs17651507, rs199515, rs12452766, rs11868380, rs12451254, rs11658329, rs11653989, rs7501742, rs2853533, rs8090011, kgp9021777, rs1941059, kgp1984670, rs567458, rs10853545, rs125555, rs1037757, rs8259, rs2074442, rs1026466, rs2446195, rs428453, rs2230199, rs2161468, rs10419865, rs2738446, rs714772, rs4646529, rs2239367, rs10418069, kgp6160768, rs12977516, rs2305802, rs773930, rs401502, rs11671559, rs2967456, rs28493229, kgp5448752, rs11547806, rs365745, rs519113, rs7259004, rs6966, rs1064202, rs296365, rs2307279, rs10417472, rs3829655, rs8105668, rs4813043, rs1053783, rs6040399, rs2250889, rs12617, kgp3493774, rs2869343, rs7271624, rs6026990, rs2032297, rs198887, rs9983044, rs25678, rs2839398, kgp10445119, rs4148111, rs6586345, kgp11497610, kgp6830733, kgp2318663, rs17004785, kgp23307465, rs165656, kgp11396666, kgp1866281, rs4820255, rs228941, rs229526, rs6002455, kgp3971415, rs2071885, rs738409, rs6007344, kgp8465547, kgp9006041, kgp6121392, kgp2049186, rs3087965, rs17312163, rs3810682, rs10521351, rs1017682, rs1494481, rs1180895, rs1734791, kgp22775108
Use plink to remove these SNPs
./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/b1b2merged –exclude tempIDS.txt –make-bed –out Merged/Lipoedema_Batches_final/b1b2merged.no_at-cg
1) First we extract only good quality SNPs from the merged dataset (maf, geno, hwe filters) and export the resulting dataset as vcf file
./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/b1b2merged.no_at-cg –maf 0.01 –geno 0.05 –hwe 1e-6 midp –recode vcf –out Merged/Lipoedema_Batches_final/Qc/Qc_arrays_inconsistency/b1b2merged.no_at-cg.maf.geno.hwe
2) Actual check for inconsistencies between the 2 lipoedema arrays
To avoid batch effect generated by genotyping the lipoedema samples in two slightly different SNP arrays, 22 samples were genotyped in both batches.
#Load the VCF
vcf = data.frame(fread(file.path(qc2dir,paste0("b1b2merged.no_at-cg.maf.geno.hwe",".vcf"))),stringsAsFactors = F)
b1sams = colnames(vcf)[startsWith(colnames(vcf),"B1_")]
b2sams = colnames(vcf)[startsWith(colnames(vcf),"B2_")]
names(b1sams) = unlist(lapply(colnames(vcf)[startsWith(colnames(vcf),"B1_")], function(x) unlist(strsplit(x, "_"))[3]))
names(b2sams) = unlist(lapply(colnames(vcf)[startsWith(colnames(vcf),"B2_")], function(x) unlist(strsplit(x, "_"))[3]))
b1_shared = c(b1sams[which(names(b1sams) %in% names(b2sams))],b1sams["D116479"])
b2_shared = c(b2sams[which(names(b2sams) %in% names(b1sams))],b2sams["D116497"])
#Check for inconsistent IDs per samples
incids = unlist(lapply(c(1:length(b1_shared)), function(x) vcf[(vcf[,b1_shared[x]]!=vcf[,b2_shared[x]] & vcf[,b1_shared[x]]!="./." & vcf[,b2_shared[x]]!="./."),]$ID))
incsnps = unique(unlist(incids))
#Write all the inconsistent SNPs on a txt file which we will use to exclude them
#write.table(incsnps, paste0(qcdir,'/incIDS.txt'), quote=FALSE, row.names = FALSE, col.names = FALSE)
#write.table(incsnps, 'tempIDS.txt', quote=FALSE, row.names = FALSE, col.names = FALSE)
#Write inconsistent tables for illumina
verincsnps = names(which(sort(table(unlist(incids)))>3))
vcfb1 = vcf[vcf$ID %in% verincsnps, c(colnames(vcf)[1:9],b1_shared)]
colnames(vcfb1) = unlist(lapply(colnames(vcfb1), function(x) tail(unlist(strsplit(x,"_")),n=1)))
vcfb2 = vcf[vcf$ID %in% verincsnps, c(colnames(vcf)[1:9],b2_shared)]
colnames(vcfb2) = unlist(lapply(colnames(vcfb2), function(x) tail(unlist(strsplit(x,"_")),n=1)))
#write.csv(vcfb1, file.path(qc2dir,"inconsistent_snps_b1.csv"), quote = F, row.names = F)
#write.csv(vcfb2, file.path(qc2dir,"inconsistent_snps_b2.csv"), quote = F, row.names = F)
cat(paste0(length(unique(c(vcfb1$ID,vcfb2$ID)))," SNPs showing inconsistency between the 2 arrays were removed: "))
## 4 SNPs showing inconsistency between the 2 arrays were removed:
cat(paste(unique(c(vcfb1$ID,vcfb2$ID)),collapse=", "))
## rs2382444, rs10904494, rs8015827, rs2248797
./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/b1b2merged.no_at-cg –exclude tempIDS.txt –make-bed –out Merged/Lipoedema_Batches_final/b1b2merged.no_at-cg.no_inc
lipo_to_keep = read.table("Input/Sample_info/lipoedema_samples_touse.txt",stringsAsFactors = F, header=T)
lipo_excludes = mfam[!mfam$V2 %in% lipo_to_keep$DNumber,c(1,2)]
#write.table(lipo_excludes, file.path(dir,'lipo_excludes.txt'), sep='\t', row.names = FALSE, quote=FALSE)
new_sample_basename = "Merged/Lipoedema_Batches_final/b1b2merged.no_at-cg.no_inc.true_lipo"
We need to remove all samples we decided to exclude from the analysis:
./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/b1b2merged.no_at-cg.no_inc –geno 0.05 –maf 0.01 –hwe 1e-6 midp –remove Merged/Lipoedema_Batches_final/lipo_excludes.txt –make-bed –out Merged/Lipoedema_Batches_final/b1b2merged.no_at-cg.no_inc.true_lipo; rm temp*
sample_basename = new_sample_basename
1)Generate the missing rate/calling rate, heterozygocity and sex checking files
./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/b1b2merged.no_at-cg.no_inc.true_lipo –missing –het –check-sex –out Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo
2)Prune SNPs in relative LD so we can correctly calculate IBD
./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/b1b2merged.no_at-cg.no_inc.true_lipo –indep-pairwise 50 5 0.5 –out Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo
3)IBD on pruned variants
./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/b1b2merged.no_at-cg.no_inc.true_lipo –maf 0.01 –genome –out Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo –extract Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.prune.in
Import all the files that plink generated into R. Then we can plot and do some statistical analyses on them.
#Define the file names
imiss = file.path(qcdir,paste0(basename(sample_basename),'.imiss'))
het = file.path(qcdir,paste0(basename(sample_basename),'.het'))
sexcheck = file.path(qcdir,paste0(basename(sample_basename),'.sexcheck'))
ibd = file.path(qcdir,paste0(basename(sample_basename),'.genome'))
# Stop the script if the data file cannot be found in the current working directory:
stopifnot(file.exists(imiss))
stopifnot(file.exists(het))
stopifnot(file.exists(sexcheck))
stopifnot(file.exists(ibd))
#Load the actual data into R
miss_tab <- read.table(file=imiss, header=T, stringsAsFactors = FALSE)
het_tab <- read.table(file=het, header=T, stringsAsFactors = FALSE)
sexchk_tab <- read.table(file=sexcheck, header=T, stringsAsFactors = FALSE)
ibd_tab <- read.table(file=ibd, header=T, stringsAsFactors = FALSE)
ibd_tab$ID1 = paste(ibd_tab$FID1,ibd_tab$IID1,sep="-")
ibd_tab$ID2 = paste(ibd_tab$FID2,ibd_tab$IID2,sep="-")
sexchk_tab$ID = paste(sexchk_tab$FID,sexchk_tab$IID,sep="-")
Autosomal Heterozygocity = (Number of autosomal genotypes - Observed number of homozygotes)/Number of autosomal genotypes
Calling Rate = 1-(Number of missing genotype calls/ Number of valid calls)
Outliers: Samples with Heterozygocity < mean-3SD heterozygocity calling rate
Samples with Heterozygocity > mean+3SD heterozygocity calling rate
Samples with Calling rate < 0.97
#Calculate the heterozygocity rate
het_rate = (het_tab$N.NM. - het_tab$O.HOM.)/het_tab$N.NM.
#Calculate the genotype calling rate
cal_rate = 1-miss_tab$F_MISS
names(het_rate) = paste(het_tab$FID,het_tab$IID,sep="-")
names(cal_rate) = paste(miss_tab$FID,miss_tab$IID,sep="-")
ibd_tab$CAL1 = cal_rate[ibd_tab$ID1]
ibd_tab$CAL2 = cal_rate[ibd_tab$ID2]
#Name the outliers
outliers = unique(c(names(cal_rate[cal_rate < 0.97]), names(het_rate[het_rate > mean(het_rate)+(3*sd(het_rate)) | het_rate < mean(het_rate)-(3*sd(het_rate))])))
outliers_rates = data.frame(Name=outliers, Call=cal_rate[outliers], Het=het_rate[outliers])
bline_outliers = unique(c(names(cal_rate[cal_rate < 0.975]), names(het_rate[het_rate > mean(het_rate)+(2*sd(het_rate)) | het_rate < mean(het_rate)-(2*sd(het_rate))])))
bline_outliers_rates = data.frame(Name=bline_outliers, Call=cal_rate[bline_outliers], Het=het_rate[bline_outliers])
#Draw the heterozygocity to calling rate plot
hc_plot = ggplot(miss_tab, aes(x=het_rate, y=cal_rate)) +
geom_point(size=2, shape=21) +
labs(x="Autosomal Heterozygocity Rate", y = "Calling Rate") +
theme_classic() +
geom_vline(xintercept=mean(het_rate)+(3*sd(het_rate)), colour='green', linetype = "longdash", size=0.2) +
geom_vline(xintercept=mean(het_rate)-(3*sd(het_rate)), colour='green', linetype = "longdash", size=0.2) +
geom_hline(yintercept=0.97, colour='gray', linetype = "longdash", size=0.2) #+
#geom_text(data=outliers_rates, aes(x=Het, y=Call, label=Name), hjust=0, vjust=+1.5, size=3)
hc_plot
outliers = outliers
Grey horizontal line: Calling Rate=0.97 threshold Green vertical line: ±3 SD deviation from the heterozygosity rate mean of the samples threshold 2 Outliers (will be excluded)
ibd_tab = ibd_tab[!(ibd_tab$ID1 %in% outliers | ibd_tab$ID2 %in% outliers),]
sexchk_tab = sexchk_tab[!sexchk_tab$ID %in% outliers,]
Can our genotype calls correctly predict the sex of our samples?
This is a QC step involving the calculation of the F inbreeding coefficient of the X chromosome. Under sex-linkage, inbreeding only causes an increase in the frequency of homozygous recessives in females. There is no meaning to the inbreeding coefficient of a male since he is hemizygous (i.e. he has only one X chromosome) and is, therefore, not affected by inbreeding. Check here for more information. Therefore, according to plink we are expecting:
F < 0.2 for females
F > 0.6 for males
#Get the F coefficient
sex_f = sexchk_tab$F
rownames(sexchk_tab)=paste0(sexchk_tab$FID,"-",sexchk_tab$IID)
#Create the plot
sex_f_color = ifelse(sexchk_tab$PEDSEX==sexchk_tab$SNPSEX, 'pink', 'grey')
barplot(sexchk_tab$F, col = sex_f_color, main='',
xlab='F inbreeding coefficient', ylab='No. of samples')
legend('bottomright', legend=c('True Females', 'Females predicted as Males'),
col=c('pink','gray'), lwd=10)
According to this histogram, all of our samples had an F coefficient less than 0.2 showing that our samples are reliably female samples.
We need to remove all samples we decided to exclude from the analysis:
./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/b1b2merged.no_at-cg.no_inc.true_lipo –remove tempexcIDS.txt –make-bed –out Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd; rm temp; rm toexc;
1)Prune SNPs in relative LD and remove missing calls so we can correctly calculate Principal Components later
./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd –indep-pairwise 50 5 0.5 –out Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd.ldprune
2) Remove the pruned variants from the dataset
./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd –maf 0.01 –geno 0.05 –genome –extract Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd.ldprune.prune.in –make-bed -out Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd.ldpruned
We will generate the PCA plot to check how the samples are clustered based on their genotype data. Any batch effects such as ethnicity will be revealed. We’ll be using GCTA software for this (check Dependencies.R script).
First, we need to convert our plink files to GCTA grm format
./resources/software/gcta_1.93.2beta/gcta64 –bfile Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd.ldpruned –make-grm –thread-num 10 –out Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd.ldpruned
Now we can use the grm files we created to perform the GCTA PC analysis. The output eigenvec and eigenval files are the ones we need for the PCA plot
./resources/software/gcta_1.93.2beta/gcta64 –grm Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd.ldpruned –pca 4 –out Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd.ldpruned
###PCA Plot on the LD pruned data-set
#Load the eigenvectors and eigenvalues to draw the plot
pcs = read.table(file.path(qcdir,paste0(basename(sample_basename),".cal.het.ibd.ldpruned.eigenvec")), header=FALSE, stringsAsFactors = FALSE)
pc_vals = read.table(file.path(qcdir,paste0(basename(sample_basename),".cal.het.ibd.ldpruned.eigenval")), header=FALSE, stringsAsFactors = FALSE)
pc_vals_tab = data.frame(PC_Eigenvalues=pc_vals[c(1:5),])
rownames(pc_vals_tab)=paste0('PC', c(1:nrow(pc_vals_tab)))
colnames(pcs)=c('FID','IID',paste0('PC', c(1:(ncol(pcs)-2))))
pcs$group = NA
pcs[startsWith(pcs$FID,"B1_"),]$group = "Batch1"
pcs[startsWith(pcs$FID,"B2_"),]$group = "Batch2"
#PC1 vs PC2
pca1 = ggplot(pcs, aes(x=PC1, y=PC2, color=group)) +
geom_point(size=2, shape=21) +
labs(x="PC1", y = "PC2") +
theme_classic()
#PC2 vs PC3
pca2 = ggplot(pcs, aes(x=PC2, y=PC3, color=group)) +
geom_point(size=2, shape=21) +
labs(x="PC2", y = "PC3") +
theme_classic()
#PC1 vs PC3
pca3 = ggplot(pcs, aes(x=PC1, y=PC3, color=group)) +
geom_point(size=2, shape=21) +
labs(x="PC1", y = "PC3") +
theme_classic()
#Table with EigenValues and all the PCA plots
pc_vals_tab[c(1:5),] %>% kable(caption = 'Eigenvalues') %>% kable_styling()
| x |
|---|
| 1.26552 |
| 1.13110 |
| 1.09867 |
| 1.09413 |
| 1.09061 |
pca1
pca2
pca3
There are outliers in our PCA plots which might need to be excluded. To make more sense of it we’re going to merge our dataset with the HapMap dataset and create a new PCA which will show any ancestry outliers and their origin.
PCA analysis of GWAS & HapMap samples We expect all our samples to cluster together with the Central European samples.
1)Extract HapMap SNPs
./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd –extract ./resources/hapmap/hapmap3r2_CEU.CHB.JPT.YRI.no-at-cg-snps.txt –make-bed –out Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd_hapmap-snps
2)We will merge the HapMap and Batch2 datasets but multi-allelic SNPs will be created: If an allele is the major in one dataset and the minor in the other dataset, then plink marks them as multi-allelic SNPs which we will remove.
Initially merge the HapMap and Batch2 datasets:
./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd_hapmap-snps –bmerge ./resources/hapmap/hapmap3r2_CEU.CHB.JPT.YRI.founders.no-at-cg-snps_hg19_bad_snps_removed
Remove the multi-allelic SNPs
./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd_hapmap-snps –exclude plink.missnp –make-bed –out Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd_hapmap-snps2
./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd_hapmap-snps2 –bmerge ./resources/hapmap/hapmap3r2_CEU.CHB.JPT.YRI.founders.no-at-cg-snps_hg19_bad_snps_removed –out Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd_hapmap-snps2_for_PCA
Prune SNPs in relative LD and remove missing calls so we can correctly calculate Principal Components later
./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd_hapmap-snps2_for_PCA –indep-pairwise 50 5 0.5 –out Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd_hapmap-snps2_for_PCA
Remove the pruned variants from the dataset
./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd_hapmap-snps2_for_PCA –maf 0.01 –geno 0.05 –extract Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd_hapmap-snps2_for_PCA.prune.in –make-bed -out Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd_hapmap-snps2_for_PCA.pruned
Convert our plink files to GCTA grm format
./resources/software/gcta_1.93.2beta/gcta64 –bfile Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd_hapmap-snps2_for_PCA.pruned –make-grm –autosome –thread-num 10 –out Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd_hapmap-snps2_for_PCA.pruned
Now we can use the grm files we created to perform the GCTA PC analysis. The output eigenvec and eigenval files are the ones we need for the PCA plot
./resources/software/gcta_1.93.2beta/gcta64 –grm Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd_hapmap-snps2_for_PCA.pruned –pca 4 –out Merged/Lipoedema_Batches_final/Qc/b1b2merged.no_at-cg.no_inc.true_lipo.cal.het.ibd_hapmap-snps2_for_PCA.pruned
#Load the eigenvectors and eigenvalues to draw the plot
pcs = read.table(file.path(qcdir,paste0(basename(sample_basename),".cal.het.ibd_hapmap-snps2_for_PCA.pruned.eigenvec")), header=FALSE, stringsAsFactors = FALSE)
pc_vals = read.table(file.path(qcdir,paste0(basename(sample_basename),".cal.het.ibd_hapmap-snps2_for_PCA.pruned.eigenval")), header=FALSE, stringsAsFactors = FALSE)
colnames(pcs)=c('FID','IID',paste0('PC', c(1:(ncol(pcs)-2))))
pcs$group = NA
pcs[startsWith(pcs$FID,"B1_"),]$group = "Lipoedema_Batch1"
pcs[startsWith(pcs$FID,"B2_"),]$group = "Lipoedema_Batch2"
pcs[startsWith(pcs$IID,"NA") & startsWith(pcs$FID,"1"),]$group = "CEU"
pcs[startsWith(pcs$IID,"NA") & startsWith(pcs$FID,"NA"),]$group = "CHB"
pcs[startsWith(pcs$IID,"NA") & startsWith(pcs$FID,"Y"),]$group = 'YRI'
pcs$group = factor(pcs$group)
pc_vals_tab = data.frame(PC_Eigenvalues=pc_vals[c(1:5),])
rownames(pc_vals_tab)=paste0('PC', c(1:nrow(pc_vals_tab)))
#pc_vals = read.table(paste0(plink_dir,out_suffix,'_hapmap-snps2_for_PCA_pruned.eigenval'), header=FALSE, stringsAsFactors = FALSE)
#colnames(pcs)=c('FID','IID',paste0('PC', c(1:(ncol(pcs)-2))))
#colnames(pc_vals)=c('EigenValues')
#PC1 vs PC2
#pcs = pcs[pcs$group %in% c("Lipoedema_Batch1","Lipoedema_Batch2"),]
pca1 = ggplot(pcs, aes(x=PC1, y=PC2, color=group, label=paste0(FID,"-",IID))) +
geom_point(size=2, shape=21) +
labs(x="PC1", y = "PC2") + geom_text(aes(label=ifelse((abs(PC1)>4*IQR(PC1)|abs(PC2)>4*IQR(PC2)),IID,"")), vjust=-0.6) +
theme_classic()
pca1
pc_vals_tab[c(1:5),] %>% kable(caption = 'Eigenvalues') %>% kable_styling()
| x |
|---|
| 41.85960 |
| 35.75280 |
| 1.86259 |
| 1.51920 |
| 1.36796 |
gg = ggplotly(pca1)
gg
## 5 PCA Outliers will be excluded.
These are outliers around the European population and therefeore we excluded them.
Save the ids that need to be removed:
We need to remove all the samples and variants we decided to exclude from the analysis:
cat Merged/Lipoedema_Batches_final/Qc/incIDS.txt > Merged/Lipoedema_Batches_final/Qc/incatcgIDS.txt; cat Merged/Lipoedema_Batches_final/Qc/atcgIDS.txt >> Merged/Lipoedema_Batches_final/Qc/incatcgIDS.txt; ./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/b1b2merged –exclude Merged/Lipoedema_Batches_final/Qc/incatcgIDS.txt –remove Merged/Lipoedema_Batches_final/Qc/removeidsfinal.txt –maf 0.01 –geno 0.05 –hwe 1e-6 midp –make-bed –out Merged/Lipoedema_Batches_final/lipoedema_clean
5849 female samples enrolled in the Understanding Society UK study (Understanding Society UK) and genotyped using HumanCoreExome-12_v1.0 were used as controls (European Genome-phenome Archive ID: EGAD00010000890).
#Here we need to specify some locations for the analysis
sample_basename = "Input/USociety_final/USociety"
dir = dirname(sample_basename)
qcdir = file.path(dir, "Qc/")
finalname = "USociety_clean"
dir.create(dir)
## Warning in dir.create(dir): 'Input\USociety_final' already exists
dir.create(qcdir)
## Warning in dir.create(qcdir): 'Input\USociety_final\Qc' already exists
Get the names right
fam = paste0(sample_basename,".fam")
famtab = data.frame(fread(fam),stringsAsFactors = F)
newfam = data.frame(oldFID = famtab$V1, oldIID = famtab$V2, FID = paste0("USO",sprintf("%06d", c(1:nrow(famtab)))), IID = paste0("USO",sprintf("%06d", c(1:nrow(famtab))),1), PHENO = 1, stringsAsFactors = F)
#write.table(newfam, paste0(dir,"/USociety_IDS.txt"), quote = F, row.names = F, col.names = F)
awk ‘{print $1,$2,$3,$4}’ Input/USociety_final/USociety_IDS.txt > tmpIDS.txt; ./resources/software/plink1.9/plink –bfile Input/USociety_final/USociety –update-ids tmpIDS.txt –make-bed –out TEMP1; awk ‘{print $3,$4,$5}’ Input/USociety_final/USociety_IDS.txt > tmpPHENO.txt; ./resources/software/plink1.9/plink –bfile TEMP1 –pheno tmpPHENO.txt –make-bed –out Input/USociety_final/USociety_updated_all; rm temp* TEMP*
Here we need to specify some locations for the analysis again
sample_basename = "Input/USociety_final/USociety_updated_all"
dir = dirname(sample_basename)
qcdir = file.path(dir, "Qc/")
finalname = "USociety_clean"
Run PLINK Linux commands to:
1)Generate the missing rate/calling rate, heterozygocity and sex checking files
./resources/software/plink1.9/plink –bfile Input/USociety_final/USociety_updated_all –missing –het –check-sex –out Input/USociety_final/Qc/USociety_updated_all
2)Prune SNPs in relative LD so we can correctly calculate IBD
./resources/software/plink1.9/plink –bfile Input/USociety_final/USociety_updated_all –indep-pairwise 50 5 0.5 –out Input/USociety_final/Qc/USociety_updated_all
3)IBD on pruned variants
./resources/software/plink1.9/plink –bfile Input/USociety_final/USociety_updated_all –maf 0.05 –genome –out Input/USociety_final/Qc/USociety_updated_all –extract Input/USociety_final/Qc/USociety_updated_all.prune.in
Import all the files that plink generated into R. Then we can plot and do some statistical analyses on them.
#Define the file names
imiss = file.path(qcdir,paste0(basename(sample_basename),'.imiss'))
het = file.path(qcdir,paste0(basename(sample_basename),'.het'))
sexcheck = file.path(qcdir,paste0(basename(sample_basename),'.sexcheck'))
ibd = file.path(qcdir,paste0(basename(sample_basename),'.genome'))
# Stop the script if the data file cannot be found in the current working directory:
stopifnot(file.exists(imiss))
stopifnot(file.exists(het))
stopifnot(file.exists(sexcheck))
stopifnot(file.exists(ibd))
#Load the actual data into R
miss_tab <- read.table(file=imiss, header=T, stringsAsFactors = FALSE)
het_tab <- read.table(file=het, header=T, stringsAsFactors = FALSE)
sexchk_tab <- read.table(file=sexcheck, header=T, stringsAsFactors = FALSE)
ibd_tab <- data.frame(fread(ibd, header=T), stringsAsFactors = FALSE)
Autosomal Heterozygocity = (Number of autosomal genotypes - Observed number of homozygotes)/Number of autosomal genotypes
Calling Rate = 1-(Number of missing genotype calls/ Number of valid calls)
Outliers: Samples with Heterozygocity < mean-3SD heterozygocity calling rate
Samples with Heterozygocity > mean+3SD heterozygocity calling rate
Samples with Calling rate < 0.97
#Calculate the heterozygocity rate
het_rate = (het_tab$N.NM. - het_tab$O.HOM.)/het_tab$N.NM.
#Calculate the genotype calling rate
cal_rate = 1-miss_tab$F_MISS
names(het_rate) = het_tab$IID
names(cal_rate) = miss_tab$IID
#Name the outliers
outliers = unique(c(names(cal_rate[cal_rate < 0.97]), names(het_rate[het_rate > mean(het_rate)+(3*sd(het_rate)) | het_rate < mean(het_rate)-(3*sd(het_rate))])))
outliers_rates = data.frame(Name=outliers, Call=cal_rate[outliers], Het=het_rate[outliers])
bline_outliers = unique(c(names(cal_rate[cal_rate < 0.975]), names(het_rate[het_rate > mean(het_rate)+(2*sd(het_rate)) | het_rate < mean(het_rate)-(2*sd(het_rate))])))
bline_outliers_rates = data.frame(Name=bline_outliers, Call=cal_rate[bline_outliers], Het=het_rate[bline_outliers])
#Draw the heterozygocity to calling rate plot
hc_plot = ggplot(miss_tab, aes(x=het_rate, y=cal_rate, label=IID)) +
geom_point(size=2, shape=21) +
labs(x="Autosomal Heterozygocity Rate", y = "Calling Rate") +
theme_classic() +
geom_vline(xintercept=0.2275054, colour='green', linetype = "longdash", size=0.2) +
geom_vline(xintercept=0.1960522, colour='green', linetype = "longdash", size=0.2) +
geom_hline(yintercept=0.97, colour='gray', linetype = "longdash", size=0.2)
outliers = outliers
hc_plot
Grey horizontal line: Calling Rate=0.97 threshold Green vertical line: ±3 SD deviation from the heterozygosity rate mean of the samples threshold 77 Outliers (will be excluded)
ibd_tab = ibd_tab[!(ibd_tab$IID1 %in% outliers | ibd_tab$IID2 %in% outliers),]
sexchk_tab = sexchk_tab[!sexchk_tab$IID %in% outliers,]
snpsex = sexchk_tab$SNPSEX
names(snpsex) = sexchk_tab$IID
We need to remove all samples we decided to exclude from the analysis:
./resources/software/plink1.9/plink –bfile Input/USociety_final/USociety_updated_all –remove tempexcIDS.txt –filter-females –make-bed –out Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd; rm temp; rm toexc;
1)Prune SNPs in relative LD and remove missing calls so we can correctly calculate Principal Components later
./resources/software/plink1.9/plink –bfile Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd –indep-pairwise 50 5 0.5 –out Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd.ldprune
2) Remove the pruned variants from the dataset
./resources/software/plink1.9/plink –bfile Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd –maf 0.01 –geno 0.05 –genome –extract Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd.ldprune.prune.in –make-bed -out Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd.ldpruned
We will generate the PCA plot to check how the samples are clustered based on their genotype data. Any batch effects such as ethnicity will be revealed. We’ll be using GCTA software for this (check Dependencies.R script).
First, we need to convert our plink files to GCTA grm format
./resources/software/gcta_1.93.2beta/gcta64 –bfile Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd.ldpruned –make-grm –thread-num 10 –out Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd.ldpruned
Now we can use the grm files we created to perform the GCTA PC analysis. The output eigenvec and eigenval files are the ones we need for the PCA plot
./resources/software/gcta_1.93.2beta/gcta64 –grm Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd.ldpruned –pca 4 –out Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd.ldpruned
###PCA Plot on the LD pruned data-set
#Load the eigenvectors and eigenvalues to draw the plot
pcs = read.table(file.path(qcdir,paste0(basename(sample_basename),".cal.het.ibd.ldpruned.eigenvec")), header=FALSE, stringsAsFactors = FALSE)
pc_vals = read.table(file.path(qcdir,paste0(basename(sample_basename),".cal.het.ibd.ldpruned.eigenval")), header=FALSE, stringsAsFactors = FALSE)
pc_vals_tab = data.frame(PC_Eigenvalues=pc_vals[c(1:5),])
rownames(pc_vals_tab)=paste0('PC', c(1:nrow(pc_vals_tab)))
colnames(pcs)=c('FID','IID',paste0('PC', c(1:(ncol(pcs)-2))))
pcs$group = NA
pcs[startsWith(pcs$FID,"U"),]$group = "USociety"
#PC1 vs PC2
pca1 = ggplot(pcs, aes(x=PC1, y=PC2, color=group)) +
geom_point(size=2, shape=21) +
labs(x="PC1", y = "PC2") + #geom_text(aes(label=ifelse((abs(PC1)>4*IQR(PC1)|abs(PC2)>4*IQR(PC2)),IID,"")), vjust=-0.6) +
theme_classic()
#PC2 vs PC3
pca2 = ggplot(pcs, aes(x=PC2, y=PC3, color=group)) +
geom_point(size=2, shape=21) +
labs(x="PC2", y = "PC3") + #geom_text(aes(label=ifelse((abs(PC2)>4*IQR(PC2)|abs(PC3)>4*IQR(PC3)),IID,"")), vjust=-0.6) +
theme_classic()
#PC1 vs PC3
pca3 = ggplot(pcs, aes(x=PC1, y=PC3, color=group)) +
geom_point(size=2, shape=21) +
labs(x="PC1", y = "PC3") + #geom_text(aes(label=ifelse((abs(PC1)>4*IQR(PC1)|abs(PC3)>4*IQR(PC3)),IID,"")), vjust=-0.6) +
theme_classic()
#Table with EigenValues and all the PCA plots
pc_vals_tab[c(1:5),] %>% kable(caption = 'Eigenvalues') %>% kable_styling()
| x |
|---|
| 5.18538 |
| 3.47582 |
| 3.04946 |
| 2.67612 |
| 2.58114 |
pca1
pca2
pca3
PCA analysis of GWAS & HapMap samples We want to only select the samples which cluster together with the CEU hapmap population. 1)Extract HapMap SNPs
./resources/software/plink1.9/plink –bfile Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd –extract ./resources/hapmap/hapmap3r2_CEU.CHB.JPT.YRI.no-at-cg-snps.txt –make-bed –out Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd_hapmap-snps
2)We will merge the HapMap and Batch2 datasets but multi-allelic SNPs will be created: If an allele is the major in one dataset and the minor in the other dataset, then plink marks them as multi-allelic SNPs which we will remove.
Initially merge the HapMap and Batch2 datasets:
./resources/software/plink1.9/plink –bfile Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd_hapmap-snps –bmerge ./resources/hapmap/hapmap3r2_CEU.CHB.JPT.YRI.founders.no-at-cg-snps_hg19_bad_snps_removed
Remove the multi-allelic SNPs.
./resources/software/plink1.9/plink –bfile Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd_hapmap-snps –exclude plink.missnp –make-bed –out Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd_hapmap-snps2
./resources/software/plink1.9/plink –bfile Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd_hapmap-snps2 –bmerge ./resources/hapmap/hapmap3r2_CEU.CHB.JPT.YRI.founders.no-at-cg-snps_hg19_bad_snps_removed –out Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd_hapmap-snps2_for_PCA
Prune SNPs in relative LD and remove missing calls so we can correctly calculate Principal Components later
./resources/software/plink1.9/plink –bfile Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd_hapmap-snps2_for_PCA –indep-pairwise 50 5 0.5 –out Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd_hapmap-snps2_for_PCA
Remove the pruned variants from the dataset
./resources/software/plink1.9/plink –bfile Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd_hapmap-snps2_for_PCA –maf 0.01 –geno 0.05 –extract Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd_hapmap-snps2_for_PCA.prune.in –make-bed -out Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd_hapmap-snps2_for_PCA.pruned
Convert our plink files to GCTA grm format
./resources/software/gcta_1.93.2beta/gcta64 –bfile Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd_hapmap-snps2_for_PCA.pruned –make-grm –autosome –thread-num 10 –out Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd_hapmap-snps2_for_PCA.pruned
Now we can use the grm files we created to perform the GCTA PC analysis. The output eigenvec and eigenval files are the ones we need for the PCA plot
./resources/software/gcta_1.93.2beta/gcta64 –grm Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd_hapmap-snps2_for_PCA.pruned –pca 4 –out Input/USociety_final/Qc/USociety_updated_all.cal.het.ibd_hapmap-snps2_for_PCA.pruned
#Load the eigenvectors and eigenvalues to draw the plot
pcs = read.table(file.path(qcdir,paste0(basename(sample_basename),".cal.het.ibd_hapmap-snps2_for_PCA.pruned.eigenvec")), header=FALSE, stringsAsFactors = FALSE)
pc_vals = read.table(file.path(qcdir,paste0(basename(sample_basename),".cal.het.ibd_hapmap-snps2_for_PCA.pruned.eigenval")), header=FALSE, stringsAsFactors = FALSE)
colnames(pcs)=c('FID','IID',paste0('PC', c(1:(ncol(pcs)-2))))
pcs$group = NA
pcs[startsWith(pcs$FID,"U"),]$group = "USociety"
pcs[startsWith(pcs$IID,"NA") & startsWith(pcs$FID,"1"),]$group = "CEU"
pcs[startsWith(pcs$IID,"NA") & startsWith(pcs$FID,"NA"),]$group = "CHB"
pcs[startsWith(pcs$IID,"NA") & startsWith(pcs$FID,"Y"),]$group = 'YRI'
pcs$group = factor(pcs$group)
pc_vals_tab = data.frame(PC_Eigenvalues=pc_vals[c(1:5),])
rownames(pc_vals_tab)=paste0('PC', c(1:nrow(pc_vals_tab)))
#pc_vals = read.table(paste0(plink_dir,out_suffix,'_hapmap-snps2_for_PCA_pruned.eigenval'), header=FALSE, stringsAsFactors = FALSE)
#colnames(pcs)=c('FID','IID',paste0('PC', c(1:(ncol(pcs)-2))))
#colnames(pc_vals)=c('EigenValues')
#PC1 vs PC2
#pcs = pcs[pcs$group %in% c("Lipoedema_Batch1","Lipoedema_Batch2"),]
pca1 = ggplot(pcs, aes(x=PC1, y=PC2, color=group, label=paste0(FID,"-",IID))) +
geom_point(size=2, shape=21) +
labs(x="PC1", y = "PC2") + #geom_text(aes(label=ifelse((abs(PC1)>4*IQR(PC1)|abs(PC2)>4*IQR(PC2)),IID,"")), vjust=-0.6) +
theme_classic()
pca1
pc_vals_tab[c(1:5),] %>% kable(caption = 'Eigenvalues') %>% kable_styling()
| x |
|---|
| 91.45710 |
| 38.67590 |
| 3.37738 |
| 3.27517 |
| 2.83262 |
gg = ggplotly(pca1)
gg
These are outliers around the European population and therefeore we excluded them.
PCA After exclusion:
pcs = pcs[!pcs$IID %in% pca_outs[,2],]
#PC1 vs PC2
#pcs = pcs[pcs$group %in% c("Lipoedema_Batch1","Lipoedema_Batch2"),]
pca1 = ggplot(pcs, aes(x=PC1, y=PC2, color=group, label=paste0(FID,"-",IID))) +
geom_point(size=2, shape=21) +
labs(x="PC1", y = "PC2") + #geom_text(aes(label=ifelse((abs(PC1)>4*IQR(PC1)|abs(PC2)>4*IQR(PC2)),IID,"")), vjust=-0.6) +
theme_classic()
pca1
pc_vals_tab[c(1:5),] %>% kable(caption = 'Eigenvalues') %>% kable_styling()
| x |
|---|
| 91.45710 |
| 38.67590 |
| 3.37738 |
| 3.27517 |
| 2.83262 |
gg = ggplotly(pca1)
Add all under-exclusion samples together:
final_exclude = rbind(plink_exclude,pca_outs)
#write.table(final_exclude, 'tempexcIDS.txt', sep='\t', row.names = FALSE, quote=FALSE)
We need to remove all four samples we decided to exclude from the analysis:
./resources/software/plink1.9/plink –bfile Input/USociety_final/USociety_updated_all –remove tempexcIDS.txt –maf 0.01 –geno 0.05 –filter-females –hwe 1e-6 midp –make-bed –out Input/USociety_final/USociety_clean; rm temp*
Merge lipoedema cases and controls genotyping data and perform the association analysis.
We need to specify the locations of the ouput directory (Batch directory) and the locations of the two plink filesets we are going to merge
##Where are we going to run our analysis?
b1 = "Merged/Lipoedema_Batches_final/lipoedema_clean"
b2 = "Input/USociety_final/USociety_clean"
sample_basename = "Merged/USociety_vs_Lipoedema_final/lipoUSociety_merged.no_at-cg.no_inc"
dir = dirname(sample_basename)
qcdir = file.path(dir, "Qc/")
analysis_dir = file.path(dir, "Association_analysis")
meta_dir = file.path(dir, "Meta-analysis")
finalname = "lipoUSo_clean"
masterfile = "./Input/Sample_info/Glens recruits-merged_v3.csv"
lmaster = read.csv(masterfile, stringsAsFactors = F)
diagnoses <- lmaster$DIAGNOSIS.CONF.
names(diagnoses) <- lmaster$D.Number
dir.create(dir, showWarnings = FALSE)
dir.create(qcdir, showWarnings = FALSE)
dir.create(analysis_dir, showWarnings = FALSE)
dir.create(meta_dir)
## Warning in dir.create(meta_dir): 'Merged\USociety_vs_Lipoedema_final\Meta-
## analysis' already exists
b1_bim = data.frame(fread(paste0(b1,".bim")), stringsAsFactors = FALSE)
b2_bim = data.frame(fread(paste0(b2,".bim")), stringsAsFactors = FALSE)
b1_fam = data.frame(fread(paste0(b1,".fam")), stringsAsFactors = FALSE)
b2_fam = data.frame(fread(paste0(b2,".fam")), stringsAsFactors = FALSE)
rownames(b1_bim)=b1_bim$V2
rownames(b2_bim)=b2_bim$V2
#Add a unique genomic position identifier (pid) with format [no of chromosome]chr[genomic coordinate]
b1_bim$pid = paste0(b1_bim$V1,'chr',b1_bim$V4)
b2_bim$pid = paste0(b2_bim$V1,'chr',b2_bim$V4)
We need to only keep the common variants between the two batches. Variants not covered in both filesets will generate unwanted biases and batch effects.
Keep in Batch1 only the variants which are shared between Batch2 and Batch1
less Input/USociety_final/USociety_clean.bim | awk ‘{print $2}’ > tempIDS.txt; ./resources/software/plink1.9/plink –bfile Merged/Lipoedema_Batches_final/lipoedema_clean –extract tempIDS.txt –make-bed –out Merged/USociety_vs_Lipoedema_final/b1.shared
Keep in Batch2 only the variants which are shared between Batch2 and Batch1
less Merged/Lipoedema_Batches_final/lipoedema_clean.bim | awk ‘{print $2}’ > tempIDS.txt; ./resources/software/plink1.9/plink –bfile Input/USociety_final/USociety_clean –filter-females –extract tempIDS.txt –make-bed –out Merged/USociety_vs_Lipoedema_final/b2.shared
1)Merge Batch2 and Filtered Batch1 (This step will fail due to strand flipping issues but it will create the usefull missnp file which will be used for strand flipping.)
./resources/software/plink1.9/plink –bfile Merged/USociety_vs_Lipoedema_final/b2.shared –bmerge Merged/USociety_vs_Lipoedema_final/b1.shared
2)Use the missnp file to filter flip the problematic snps in batch1 and then redo the merging.
./resources/software/plink1.9/plink –bfile Merged/USociety_vs_Lipoedema_final/b1.shared –flip plink.missnp –make-bed –out Merged/USociety_vs_Lipoedema_final/b1.shared.flipped
Now we can actually merge batch2 with the flipped version of batch1 3)Merge the flipped batch1 with the batch2 data-set
In this step WE ALSO REMOVE NON ACGT SNPS from the merged dataset
./resources/software/plink1.9/plink –bfile Merged/USociety_vs_Lipoedema_final/b2.shared –bmerge Merged/USociety_vs_Lipoedema_final/b1.shared.flipped –merge-mode 1 –snps-only just-acgt –make-bed –out Merged/USociety_vs_Lipoedema_final/b1b2merged
Merge is now completed.
T -> A, A -> T, G -> C, C -> G SNPs are particularly hard to resolve when merging datasets. In this case we are going to remove them from the merged dataset.
cat(paste0(length(unique(at_cg))," T -> A, A -> T, G -> C, C -> G SNPs are being removed:\n",paste(unique(c(at_cg)),collapse=", ")))
## 0 T -> A, A -> T, G -> C, C -> G SNPs are being removed:
Use plink to remove these SNPs
./resources/software/plink1.9/plink –bfile Merged/USociety_vs_Lipoedema_final/b1b2merged –allow-no-sex –exclude tempIDS.txt –make-bed –out Merged/USociety_vs_Lipoedema_final/lipoUSociety_merged.no_at-cg.no_inc
1)Generate the missing rate/calling rate, heterozygocity and sex checking files
./resources/software/plink1.9/plink –bfile Merged/USociety_vs_Lipoedema_final/lipoUSociety_merged.no_at-cg.no_inc –missing –het –check-sex –out Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc
2)Prune SNPs in relative LD so we can correctly calculate IBD
./resources/software/plink1.9/plink –bfile Merged/USociety_vs_Lipoedema_final/lipoUSociety_merged.no_at-cg.no_inc –indep-pairwise 50 5 0.5 –out Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc
3)IBD on pruned variants
./resources/software/plink1.9/plink –bfile Merged/USociety_vs_Lipoedema_final/lipoUSociety_merged.no_at-cg.no_inc –maf 0.05 –genome –out Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc –extract Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.prune.in
Import all the files that plink generated into R. Then we can plot and do some statistical analyses on them.
#Define the file names
imiss = file.path(qcdir,paste0(basename(sample_basename),'.imiss'))
het = file.path(qcdir,paste0(basename(sample_basename),'.het'))
sexcheck = file.path(qcdir,paste0(basename(sample_basename),'.sexcheck'))
ibd = file.path(qcdir,paste0(basename(sample_basename),'.genome'))
# Stop the script if the data file cannot be found in the current working directory:
stopifnot(file.exists(imiss))
stopifnot(file.exists(het))
stopifnot(file.exists(sexcheck))
stopifnot(file.exists(ibd))
#Load the actual data into R
miss_tab <- read.table(file=imiss, header=T, stringsAsFactors = FALSE)
het_tab <- read.table(file=het, header=T, stringsAsFactors = FALSE)
sexchk_tab <- read.table(file=sexcheck, header=T, stringsAsFactors = FALSE)
ibd_tab <- data.frame(fread(file=ibd, header=T), stringsAsFactors = FALSE)
Autosomal Heterozygocity = (Number of autosomal genotypes - Observed number of homozygotes)/Number of autosomal genotypes
Calling Rate = 1-(Number of missing genotype calls/ Number of valid calls)
Outliers: Samples with Heterozygocity < mean-3SD heterozygocity calling rate
Samples with Heterozygocity > mean+3SD heterozygocity calling rate
Samples with Calling rate < 0.97
#Calculate the heterozygocity rate
het_rate = (het_tab$N.NM. - het_tab$O.HOM.)/het_tab$N.NM.
#Calculate the genotype calling rate
cal_rate = 1-miss_tab$F_MISS
names(het_rate) = het_tab$IID
names(cal_rate) = miss_tab$IID
#Name the outliers
outliers = unique(c(names(cal_rate[cal_rate < 0.97]), names(het_rate[het_rate > mean(het_rate)+(3*sd(het_rate)) | het_rate < mean(het_rate)-(3*sd(het_rate))])))
outliers_rates = data.frame(Name=outliers, Call=cal_rate[outliers], Het=het_rate[outliers])
bline_outliers = unique(c(names(cal_rate[cal_rate < 0.975]), names(het_rate[het_rate > mean(het_rate)+(2*sd(het_rate)) | het_rate < mean(het_rate)-(2*sd(het_rate))])))
bline_outliers_rates = data.frame(Name=bline_outliers, Call=cal_rate[bline_outliers], Het=het_rate[bline_outliers])
#Draw the heterozygocity to calling rate plot
hc_plot = ggplot(miss_tab, aes(x=het_rate, y=cal_rate)) +
geom_point(size=2, shape=21) +
labs(x="Autosomal Heterozygocity Rate", y = "Calling Rate") +
theme_classic() +
geom_vline(xintercept=mean(het_rate)-(2*sd(het_rate)), colour='blue', linetype = "longdash", size=0.2) +
geom_vline(xintercept=mean(het_rate)+(2*sd(het_rate)), colour='blue', linetype = "longdash", size=0.2)+
geom_hline(yintercept=0.97, colour='gray', linetype = "longdash", size=0.2) #+
#geom_text(data=outliers_rates, aes(x=Het, y=Call, label=Name), hjust=0, vjust=+1.5, size=3)
outliers = outliers
hc_plot
## Grey horizontal line: Calling Rate=0.97 threshold
## Green vertical line: ±3 SD deviation from the heterozygosity rate mean of the samples threshold
## 34 outliers will be excluded:
## 34 are Controls and 0 are Cases.
plink_exclude1 = miss_tab[miss_tab$IID %in% outliers,c('FID', 'IID')]
het_tab = het_tab[!(het_tab$IID %in% outliers),]
miss_tab = miss_tab[!(miss_tab$IID %in% outliers),]
ibd_tab = ibd_tab[!(ibd_tab$IID1 %in% outliers | ibd_tab$IID2 %in% outliers),]
sexchk_tab = sexchk_tab[!sexchk_tab$IID %in% outliers,]
Create a file of these IIDs and their FIDs which will be excluded. This file will be used with the –remove plink command.
if(length(outliers)!=nrow(plink_exclude1)){stop()}
#write.table(plink_exclude1, 'tempexcIDS.txt', sep='\t', row.names = FALSE, quote=FALSE, col.names = F)
#write.table(plink_exclude1, file.path(qcdir,'toremovesamples.txt'), sep='\t', row.names = FALSE, quote=FALSE, col.names = F)
Sex assignments have been checked before.
We have removed all the relateds Lipoedema and Control USociety samples so we don’t expect any more relateds in the cohort.
Let’s check the IBD summary statistics (PI_HAT < 0.05).
summary(ibd_tab$PI_HAT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.003466 0.006100 0.048200
Maximum PI_HAT is < 0.05 so there are no related people in the data-set.
We need to remove all samples we decided to exclude from the analysis:
./resources/software/plink1.9/plink –bfile Merged/USociety_vs_Lipoedema_final/lipoUSociety_merged.no_at-cg.no_inc –remove Merged/USociety_vs_Lipoedema_final/Qc/toremovesamples.txt –make-bed –out Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd; rm temp; rm toexc;
1)Prune SNPs in relative LD and remove missing calls so we can correctly calculate Principal Components later
./resources/software/plink1.9/plink –bfile Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd –indep-pairwise 50 5 0.5 –out Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd.ldprune
2) Remove the pruned variants from the dataset
./resources/software/plink1.9/plink –bfile Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd –maf 0.01 –geno 0.05 –extract Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd.ldprune.prune.in –make-bed -out Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd.ldpruned
We will generate the PCA plot to check how the samples are clustered based on their genotype data. Any batch effects such as ethnicity will be revealed. We’ll be using GCTA software for this (check Dependencies.R script).
First, we need to convert our plink files to GCTA grm format
./resources/software/gcta_1.93.2beta/gcta64 –bfile Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd.ldpruned –make-grm –thread-num 10 –out Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd.ldpruned
Now we can use the grm files we created to perform the GCTA PC analysis. The output eigenvec and eigenval files are the ones we need for the PCA plot
./resources/software/gcta_1.93.2beta/gcta64 –grm Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd.ldpruned –pca 8 –out Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd.ldpruned
#Load the eigenvectors and eigenvalues to draw the plot
pcs = read.table(file.path(qcdir,paste0(basename(sample_basename),".cal.het.ibd.ldpruned.eigenvec")), header=FALSE, stringsAsFactors = FALSE)
#write.table(pcs[,c(1:4)],file.path(qcdir,paste0(basename(sample_basename),".cal.het.ibd.ldpruned.2pcs.eigenvec")), quote=F, col.names = F, row.names = F)
#write.table(pcs[,c(1:5)],file.path(qcdir,paste0(basename(sample_basename),".cal.het.ibd.ldpruned.3pcs.eigenvec")), quote=F, col.names = F, row.names = F)
#write.table(pcs[,c(1:6)],file.path(qcdir,paste0(basename(sample_basename),".cal.het.ibd.ldpruned.4pcs.eigenvec")), quote=F, col.names = F, row.names = F)
pc_vals = read.table(file.path(qcdir,paste0(basename(sample_basename),".cal.het.ibd.ldpruned.eigenval")), header=FALSE, stringsAsFactors = FALSE)
pc_vals_tab = data.frame(PC_Eigenvalues=pc_vals[c(1:5),])
rownames(pc_vals_tab)=paste0('PC', c(1:nrow(pc_vals_tab)))
colnames(pcs)=c('FID','IID',paste0('PC', c(1:(ncol(pcs)-2))))
pcs$group = NA
pcs[startsWith(pcs$FID,"B1_"),]$group = "Lipoedema_B1"
pcs[startsWith(pcs$FID,"B2_"),]$group = "Lipoedema_B2"
pcs[startsWith(pcs$FID,'USO'),]$group = "USociety"
#PC1 vs PC2
pca1 = ggplot(pcs, aes(x=PC1, y=PC2, color=group)) +
geom_point(size=2, shape=21) + geom_vline(xintercept = -0.1) +
labs(x="PC1", y = "PC2") + #geom_text(aes(label=ifelse((abs(PC1)>4*IQR(PC1)|abs(PC2)>4*IQR(PC2)),IID,"")), vjust=-0.6) +
theme_classic()
#PC2 vs PC3
pca2 = ggplot(pcs, aes(x=PC2, y=PC3, color=group)) +
geom_point(size=2, shape=21) +
labs(x="PC2", y = "PC3") + #geom_text(aes(label=ifelse((abs(PC2)>4*IQR(PC2)|abs(PC3)>4*IQR(PC3)),IID,"")), vjust=-0.6) +
theme_classic()
#PC1 vs PC3
pca3 = ggplot(pcs, aes(x=PC1, y=PC3, color=group)) +
geom_point(size=2, shape=21) +
labs(x="PC1", y = "PC3") + #geom_text(aes(label=ifelse((abs(PC1)>4*IQR(PC1)|abs(PC3)>4*IQR(PC3)),IID,"")), vjust=-0.6) +
theme_classic()
#Table with EigenValues and all the PCA plots
#pc_vals_tab[c(1:5),] %>% kable(caption = 'Eigenvalues') %>% kable_styling()
pca1
pca2
pca3
pcouts1 = pcs[pcs$PC1 < (-0.11),]$IID
Since we have excluded all PCA outliers in batch-specific analyeses, as expected we have no outliers in our PCA plots which need to be excluded. However we are going to run the PCA analysis again using the HapMap samples as population references to see how the analysis’ samples cluster with the hapmap populations.
We expect all our samples to cluster together with the Central European samples.
1)Extract HapMap SNPs
./resources/software/plink1.9/plink –bfile Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd –extract ./resources/hapmap/hapmap3r2_CEU.CHB.JPT.YRI.no-at-cg-snps.txt –make-bed –out Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd_hapmap-snps
2)We will merge the HapMap and Batch2 datasets but multi-allelic SNPs will be created: If an allele is the major in one dataset and the minor in the other dataset, then plink marks them as multi-allelic SNPs which we will remove. -Initially merge the HapMap and Batch2 datasets:
./resources/software/plink1.9/plink –bfile Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd_hapmap-snps –bmerge ./resources/hapmap/hapmap3r2_CEU.CHB.JPT.YRI.founders.no-at-cg-snps_hg19_bad_snps_removed
Remove the multi-allelic SNPs
./resources/software/plink1.9/plink –bfile Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd_hapmap-snps –exclude plink.missnp –make-bed –out Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd_hapmap-snps2
./resources/software/plink1.9/plink –bfile Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd_hapmap-snps2 –bmerge ./resources/hapmap/hapmap3r2_CEU.CHB.JPT.YRI.founders.no-at-cg-snps_hg19_bad_snps_removed –out Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd_hapmap-snps2_for_PCA
Prune SNPs in relative LD and remove missing calls so we can correctly calculate Principal Components later
./resources/software/plink1.9/plink –bfile Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd_hapmap-snps2_for_PCA –indep-pairwise 50 5 0.5 –out Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd_hapmap-snps2_for_PCA
Remove the pruned variants from the dataset
./resources/software/plink1.9/plink –bfile Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd_hapmap-snps2_for_PCA –maf 0.01 –geno 0.05 –extract Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd_hapmap-snps2_for_PCA.prune.in –make-bed -out Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd_hapmap-snps2_for_PCA.pruned
Convert our plink files to GCTA grm format
./resources/software/gcta_1.93.2beta/gcta64 –bfile Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd_hapmap-snps2_for_PCA.pruned –make-grm –autosome –thread-num 10 –out Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd_hapmap-snps2_for_PCA.pruned
Now we can use the grm files we created to perform the GCTA PC analysis. The output eigenvec and eigenval files are the ones we need for the PCA plot
./resources/software/gcta_1.93.2beta/gcta64 –grm Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd_hapmap-snps2_for_PCA.pruned –pca 4 –out Merged/USociety_vs_Lipoedema_final/Qc/lipoUSociety_merged.no_at-cg.no_inc.cal.het.ibd_hapmap-snps2_for_PCA.pruned
#Load the eigenvectors and eigenvalues to draw the plot
pcs = read.table(file.path(qcdir,paste0(basename(sample_basename),".cal.het.ibd_hapmap-snps2_for_PCA.pruned.eigenvec")), header=FALSE, stringsAsFactors = FALSE)
pc_vals = read.table(file.path(qcdir,paste0(basename(sample_basename),".cal.het.ibd_hapmap-snps2_for_PCA.pruned.eigenval")), header=FALSE, stringsAsFactors = FALSE)
colnames(pcs)=c('FID','IID',paste0('PC', c(1:(ncol(pcs)-2))))
pcs$group = NA
pcs[startsWith(pcs$FID,"B1_"),]$group = "Lipoedema"
pcs[startsWith(pcs$FID,"B2_"),]$group = "Lipoedema"
pcs[startsWith(pcs$FID,"USO"),]$group = "Control"
pcs[startsWith(pcs$IID,"NA") & startsWith(pcs$FID,"1"),]$group = "CEU"
pcs[startsWith(pcs$IID,"NA") & startsWith(pcs$FID,"NA"),]$group = "CHB"
pcs[startsWith(pcs$IID,"NA") & startsWith(pcs$FID,"Y"),]$group = 'YRI'
#
pcs2 = rbind(pcs[pcs$group=="Control",],pcs[pcs$group=="Lipoedema",],pcs[pcs$group=="CEU",],pcs[pcs$group=="CHB",],pcs[pcs$group=="YRI",])
pc_vals_tab = data.frame(PC_Eigenvalues=pc_vals[c(1:5),])
rownames(pc_vals_tab)=paste0('PC', c(1:nrow(pc_vals_tab)))
pcs2$alpha = 0.2
pcs2[pcs2$group=="Lipoedema",]$alpha = 0.5
pcs2[pcs2$group=="Control",]$alpha = 0.3
pcs2[pcs2$group=="YRI",]$alpha = 0.3
pcs2[pcs2$group=="CHB",]$alpha = 0.3
pcs2$group = factor(pcs2$group, levels = c("Control","Lipoedema","CEU","CHB","YRI"))
#pc_vals = read.table(paste0(plink_dir,out_suffix,'_hapmap-snps2_for_PCA_pruned.eigenval'), header=FALSE, stringsAsFactors = FALSE)
#colnames(pcs)=c('FID','IID',paste0('PC', c(1:(ncol(pcs)-2))))
#colnames(pc_vals)=c('EigenValues')
#PC1 vs PC2
#pcs = pcs[pcs$group %in% c("Lipoedema_Batch1","Lipoedema_Batch2"),]
pca1 = ggplot(pcs2, aes(x=PC1, y=PC2, color=group, label=IID)) +
geom_point(size=5, shape=19, aes(alpha=alpha)) +
labs(x=paste0("PC1: ",round(pc_vals$V1[1],1),"% Variance"), y = paste0("PC2: ",round(pc_vals$V1[2],1),"% Variance")) + #geom_text(aes(label=ifelse((abs(PC1)>4*IQR(PC1)|abs(PC2)>4*IQR(PC2)),IID,"")), vjust=-0.6) +
scale_colour_manual(values=c("#56B4E9","#E69F00","#FF0606","#009E73","#F0E442")) +
guides(fill=guide_legend(title="Cohort")) +
theme_bw()
pca1
#pc_vals_tab[c(1:5),] %>% kable(caption = 'Eigenvalues') %>% kable_styling()
gg = ggplotly(pca1)
#gg
#tiff(paste0(qcdir,basename(sample_basename),".cal.het.ibd_hapmap-snps2_for_PCA.pruned.pca_plot2.tiff"),width=2600,height=1500, res = 600, pointsize = 12)
pca1
dev.off()
## null device
## 1
plink_exclude_all = plink_exclude1
#write.table(plink_exclude_all, file.path(qcdir,'toremovesamples_all_final.txt'), sep='\t', row.names = FALSE, quote=FALSE, col.names = F)
##Create the final clean set of files: We need to remove all the samples we decided to exclude from the analysis:
./resources/software/plink1.9/plink –bfile Merged/USociety_vs_Lipoedema_final/b1b2merged –geno 0.05 –maf 0.01 –hwe 1e-6 midp –exclude Merged/USociety_vs_Lipoedema_final/Qc/atcgIDS.txt –remove Merged/USociety_vs_Lipoedema_final/Qc/toremovesamples_all_final.txt –make-bed –out Merged/USociety_vs_Lipoedema_final/lipoUSo_clean
We need to specify the locations of the ouput directory (Batch directory) and the locations of the two plink filesets we are going to merge
##Where are we going to run our analysis?
sample_basename = "./Merged/USociety_vs_Lipoedema_final/lipoUSo_clean"
dir = dirname(sample_basename)
herdir = file.path(dir, "Heritability_estimation/")
qcdir = file.path(dir, paste0(herdir,"/Qc/"))
finalname = "lipoUSo_clean"
dir.create(dir, showWarnings = FALSE)
dir.create(qcdir, showWarnings = FALSE)
dir.create(herdir, showWarnings = FALSE)
Load the fam file of the plink filesets we want to perform REML on
ffam = data.frame(fread(paste0(sample_basename,".fam")), stringsAsFactors = FALSE)
gcta_pheno = cbind(ffam[,c("V1","V2")],ffam$V6-1)
write.table(gcta_pheno, paste0(herdir,"/gcta_phenos.txt"), sep="\t", row.names=F, col.names=F, quote=F)
1)First, we need to convert our plink files to GCTA grm format
./resources/software/gcta_1.93.2beta/gcta64 –bfile ./Merged/USociety_vs_Lipoedema_final/lipoUSo_clean –make-grm –thread-num 10 –out ./Merged/USociety_vs_Lipoedema_final/Heritability_estimation/lipoUSo_clean
- Using prevalence=0.05
./resources/software/gcta_1.93.2beta/gcta64 –grm ./Merged/USociety_vs_Lipoedema_final/Heritability_estimation/lipoUSo_clean –pheno ./Merged/USociety_vs_Lipoedema_final/Heritability_estimation/gcta_phenos.txt –reml –prevalence 0.05 –out ./Merged/USociety_vs_Lipoedema_final/Heritability_estimation/lipoUSo_clean.prev005
- Using prevalence=0.1
./resources/software/gcta_1.93.2beta/gcta64 –grm ./Merged/USociety_vs_Lipoedema_final/Heritability_estimation/lipoUSo_clean –pheno ./Merged/USociety_vs_Lipoedema_final/Heritability_estimation/gcta_phenos.txt –reml –prevalence 0.1 –out ./Merged/USociety_vs_Lipoedema_final/Heritability_estimation/lipoUSo_clean.prev01
results_annotation <- function(res_path_glm, res_path_19) {
path_prefix = tools::file_path_sans_ext(res_path_glm)
results2 <- data.frame(fread(res_path_glm, header=T), stringsAsFactors = FALSE)
results3 <- data.frame(fread(res_path_19, header=T), stringsAsFactors = FALSE)
results2 <- merge(results2, results3[,c("SNP","P","OR","SE","L95","U95")], by.x="ID", by.y="SNP", all.x=T, all.y =F,
suffixes = c("_glm","_assoc"))
if("MT" %in% results2$X.CHROM){
results2[results2$X.CHROM == "MT",]$X.CHROM = 25
}
if("X" %in% results2$X.CHROM){
results2[results2$X.CHROM == "X",]$X.CHROM = 23
}
if("Y" %in% results2$X.CHROM){
results2[results2$X.CHROM == "Y",]$X.CHROM = 24
}
results2$X.CHROM = as.numeric(results2$X.CHROM)
results2$nexus = "dbsnp"
write.table(head(results2[order(results2$P_assoc),c("nexus","ID")],300), paste0(path_prefix,".for_nexus.txt"), sep="\t", quote=F,
col.names = F, row.names = F)
readline(prompt="Upload the for nexus file on nexus. save the annotation and then Press [enter] to continue")
results2_genemapping = read.table(paste0(path_prefix,".gene_mapping.txt"),stringsAsFactors = F,sep="\t",header=T)
results2_genemapping$Gene = ifelse(results2_genemapping$Overlapped.Gene=="None",paste0(results2_genemapping$Nearest.Upstream.Gene,";",results2_genemapping$Nearest.Downstream.Gene),results2_genemapping$Overlapped.Gene)
results2_top300 = merge(results2, results2_genemapping[,c("Variation.ID","Gene","Annotation")],by.x="ID",by.y="Variation.ID", all.x=F)
write.table(results2_top300, paste0(path_prefix,".top300annot.txt"), sep="\t", quote=F, col.names = F, row.names = F)
return(results2_top300)
}
make_plots_ready_plink2 <- function(res_path){
path_prefix = tools::file_path_sans_ext(res_path)
results2 <- data.frame(fread(res_path, header=T), stringsAsFactors = FALSE)
if("X.CHROM" %in% colnames(results2)){
results2 = results2[!results2$CHROM %in% c("MT","X"),]
results2$X.CHROM = as.numeric(results2$X.CHROM)
}
if("CHR" %in% colnames(results2)){
results2 = results2[!results2$CHR %in% c("MT","X"),]
results2$CHR = as.numeric(results2$CHR)
}
fresults=results2
return(fresults)
}
We are going to run both the simple logistic regression analysis on PLINK 1.9 which will provide the OR and the PValue we are counting on but we are also goint to run the PLINK 2.0 glm analysis without covariates to obtain some nice stats that only PLINK 2.0 outputs:
Run the Association Analysis
temp_comm = paste0(plink,' --bfile ',file.path(dir,finalname),' --logistic --ci 0.95 ',' --out ',
file.path(analysis_dir,finalname),"; \n",
plink2,' --bfile ',file.path(dir,finalname),
' --glm allow-no-covars cols=chrom,pos,ref,alt1,alt,a1countcc,totallelecc,gcountcc,nobs,beta,orbeta,tz,p',
' --out ',file.path(analysis_dir,finalname))
./resources/software/plink1.9/plink –bfile ./Merged/USociety_vs_Lipoedema_final/lipoUSo_clean –logistic –ci 0.95 –out Merged/USociety_vs_Lipoedema_final/Association_analysis/lipoUSo_clean; ./resources/software/plink2/plink2 –bfile ./Merged/USociety_vs_Lipoedema_final/lipoUSo_clean –glm allow-no-covars cols=chrom,pos,ref,alt1,alt,a1countcc,totallelecc,gcountcc,nobs,beta,orbeta,tz,p –out Merged/USociety_vs_Lipoedema_final/Association_analysis/lipoUSo_clean
Calculate genomic inflation
We’re gonna run the same plink –logistic association but also using –adjust so the lambda inflation factor will be calculated
temp_comm = paste0(plink,' --bfile ',file.path(dir,finalname),' --adjust --logistic --ci 0.95 ',' --out ',
file.path(analysis_dir,finalname),'_for_lambda')
./resources/software/plink1.9/plink –bfile ./Merged/USociety_vs_Lipoedema_final/lipoUSo_clean –adjust –logistic –ci 0.95 –out Merged/USociety_vs_Lipoedema_final/Association_analysis/lipoUSo_clean_for_lambda
Association Analysis Results
#Load the assocation results from both --logistic and --glm analyses and merge them into R
if(run==T){
strict_top300 <- results_annotation(paste0(file.path(analysis_dir,finalname),".PHENO1.glm.logistic.hybrid"),
paste0(file.path(analysis_dir,finalname),".assoc.logistic"))
strict_top30_assoc <- head(strict_top300[order(strict_top300$P_assoc),],35)
strict_top30_glm <- head(strict_top300[order(strict_top300$P_glm),],35)
strict_top30 <- strict_top300[strict_top300$ID %in% unique(strict_top30_assoc$ID,strict_top30_glm$ID),]
strict_top30 <- strict_top30[!duplicated(strict_top30$ID),]
strict_top30 <- strict_top30[order(strict_top30$P_assoc),]
#Write the top30 snps
write.table(strict_top30, paste0(file.path(analysis_dir,finalname),".PHENO1.glm.logistic",".top30annot.txt"),
sep="\t", quote=F, col.names = T, row.names = F)
#write_the_file_for the meta-analysis
discovery_meta = cbind(strict_top30[,c(1,7,4)],strict_top30$CASE_ALLELE_CT/2,strict_top30[,c(23,24,22)])
colnames(discovery_meta) = c("SNP","A1","REF","N","OR","SE","P")
write.table(discovery_meta,
paste0(meta_dir,"/lipoUSo_clean.PHENO1.glm.logistic.top30annot.case_weight.or.metal.txt"),sep="\t",
quote=F,row.names = F)
}
#Get data ready for plot
strict_results <- data.frame(fread(paste0(file.path(analysis_dir,finalname),".assoc.logistic"), header=T), stringsAsFactors = FALSE)
strict_results_top <- head(strict_results[order(strict_results$P),],10000)
plot_strict = strict_results[,c(2,1,3,12)]
plot_strict = plot_strict[!plot_strict$CHR=='26',]
plot_strict[plot_strict$CHR=='23',]$CHR = "X"
plot_strict_top = head(plot_strict[order(plot_strict$P),],10000)
#highlight snps with meta_p<1e-4 and same direction
highlight_snps=c("rs1409440","rs7994616","rs11616618","rs9308098","rs10521271","rs11511253","rs4554078","rs2019845","rs10499948","rs16825349")
#Draw manhattan plot
par(mar=c(5.1, 5.1, 4.1, 2.1))
CMplot(plot_strict,plot.type="m",threshold=c(1e-8,1e-4),threshold.lty=c(0,2),
threshold.lwd=c(1,1), threshold.col=c("black","black"),amplify=F,
signal.col=NULL,highlight.col=NULL,file.output = F,dpi=900,
pch=21,cex=.9,ylim=c(0,6),col=c("#648fff","#ffb000"),signal.pch = 19,signal.cex=.9,
highlight = highlight_snps, highlight.cex = 1.5, highlight.pch = 19, cex.axis = 1, cex.lab=1.3)
## (warning: all phenotypes will use the same ylim.)
## Rectangular-Manhattan Plotting P.
#Draw qq plot
par(mar=c(5.1, 5.1, 4.1, 2.1))
CMplot(plot_strict,col = "#648fff",plot.type="q",box=FALSE,file.output = F,file='tiff',memo="",dpi=900,
conf.int=TRUE,conf.int.col=NULL,threshold.col="red",threshold.lty=2,
width=8,height=5)
## QQ Plotting P.
text(5.3,0,"lambda = 1.00364",cex = 1.5)
We’re gonna perform meta analysis on METAL software. Association Results for the 25 variants crossing the suggestive cutoff from the discvoery study and the results from the replication study for the same variants have been reformed in a format accepted by the METAL software and the new files have been placed in the Meta-analysis directory.
Create the METAL script
if(run==T){
sink(file.path(meta_dir,"metal_analysis.or.txt"))
cat("SCHEME STDERR")
cat("\n\n")
cat("MARKER\tSNP
WEIGHT\tN
ALLELE\tA1 REF
EFFECT\tlog(OR)
STDERR\tSE
PVAL\tP\n")
cat("\n")
cat(paste0("PROCESS ",file.path(meta_dir,finalname),
".PHENO1.glm.logistic.top30annot.case_weight.or.metal.txt\n"))
cat(paste0("PROCESS ",meta_dir,"/gel_lipo_gwas_strict_all_assoc_for_meta.txt\n"))
cat("\n")
cat(paste0("OUTFILE ",meta_dir,"/final_meta-analysis_ .tbl\n\n"))
cat("ANALYZE")
sink()
}
Run METAL software
temp_comm = paste0(metal," ",file.path(meta_dir,"metal_analysis.or.txt"))
./resources/software/generic-metal/metal Merged/USociety_vs_Lipoedema_final/Meta-analysis/metal_analysis.or.txt
Load and merge all data from discovery, replication and meta-analysis analyses
if(run==T){
discovery = read.table(paste0(file.path(analysis_dir,finalname),".PHENO1.glm.logistic.top30annot.txt"),
sep="\t", stringsAsFactors = F, header=1)
replication = read.table(paste0(meta_dir,"/gel_lipo_gwas_strict_all_assoc.txt"),
stringsAsFactors = F,sep="\t",header=1)
meta_analysis = read.table(paste0(meta_dir,"/final_meta-analysis_1.tbl"),
stringsAsFactors = F, sep="\t",header=1)
disc_rep = merge(discovery, replication, by.x="ID",by.y="ID",suffixes=c("_SGUL","_GEL"))
final_results = merge(disc_rep, meta_analysis, by.x="ID", by.y="MarkerName")
write.table(final_results,paste0(meta_dir,"/final_all_merged.tsv"),sep="\t",quote=F,row.names = F)
}